Clean
# Give the columns short, readable names
names(train) <- c(
  "file", "time", "weather", "smoke", "pelicans", "pods", "npods",
  "disturbance", "stageofnesting", "preds", "abandon", "pretty"
)

# Drop the last character of every file name (described in the original
# note as a stray trailing apostrophe)
train[["file"]] <- substr(
  train[["file"]],
  start = 1,
  stop = nchar(train[["file"]]) - 1
)

# Convert every column except file (1) and npods (7) to a factor
train[c(2:6, 8:12)] <- lapply(train[c(2:6, 8:12)], factor)

# Relabel the weather levels positionally: levels 1-3 become "cloud",
# level 4 becomes "rain", level 5 becomes "sun".
# NOTE(review): this depends on the existing order of levels(train$weather);
# confirm that order before trusting the mapping.
levels(train[["weather"]]) <- c("cloud", "cloud", "cloud", "rain", "sun")

# TODO: make NAs meaningful (no code for this step is present here)

# Build a timestamp column for time-series work: strip the last four
# characters of the file name and parse what remains as
# year-month-day hour-minute-second
train[["date"]] <- ymd_hms(
  substr(train[["file"]], start = 1, stop = nchar(train[["file"]]) - 4)
)

# Inspect the result so far
glimpse(train)
Observations: 1,011
Variables: 13
$ file <chr> "20170309002000.jpg", "20170309014000.jpg", "20170309091500.jpg", "20170309162000.j...
$ time <fct> day, day, night, day, day, night, day, night, day, day, night, night, day, day, day...
$ weather <fct> sun, NA, NA, sun, sun, NA, cloud, NA, sun, cloud, NA, NA, sun, sun, sun, NA, NA, NA...
$ smoke <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FAL...
$ pelicans <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, TRUE, TRUE, NA, NA, TRUE, TRUE, FALSE, ...
$ pods <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, TRUE, TRUE, NA, NA, TRUE, TRUE, FALSE, ...
$ npods <int> NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA,...
$ disturbance <fct> TRUE, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FALS...
$ stageofnesting <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ preds <fct> FALSE, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FAL...
$ abandon <fct> NA, NA, NA, NA, NA, NA, NA, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, NA, NA, NA, NA,...
$ pretty <fct> NA, NA, NA, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, NA, NA, FALSE, FALSE, FALSE,...
$ date <dttm> 2017-03-09 00:20:00, 2017-03-09 01:40:00, 2017-03-09 09:15:00, 2017-03-09 16:20:00...
# Keep columns 1-8, 12 and 13 (file, time, weather, smoke, pelicans, pods,
# npods, disturbance, pretty, date); columns 9-11 (stageofnesting, preds,
# abandon) are dropped as uninformative
train <- train[c(1:8, 12:13)]